{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- encoding: utf-8 -*-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "reload(sys)\n",
    "sys.setdefaultencoding('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from gensim import corpora,models,similarities,utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 获取训练数据\n",
    "def getTrainSet(inFile):\n",
    "    # 文章标题集\n",
    "    title_set = []\n",
    "    # 训练集\n",
    "    train_set=[]\n",
    "    # 读入训练数据\n",
    "    f=open(inFile)\n",
    "    lines=f.readlines()\n",
    "    for line in lines:\n",
    "        article = line.replace('\\n','').split('\\t')\n",
    "        title = article[0]\n",
    "        title_set.append(title)\n",
    "        content = article[1:]\n",
    "        train_set.append(content)\n",
    "    f.close()\n",
    "    return (title_set,train_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# load TFIDF模型\n",
    "def loadModelTFIDF(mdlFile,dicFile,idxFile):\n",
    "    #载入字典\n",
    "    dictionary = corpora.Dictionary.load(dicFile)\n",
    "    #载入TFIDF模型和索引\n",
    "    tfidfModel = models.TfidfModel.load(mdlFile)\n",
    "    indexTfidf = similarities.MatrixSimilarity.load(idxFile)\n",
    "    \n",
    "    return (tfidfModel,dictionary,indexTfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 利用TFIDF模型计算相似度\n",
    "def simTFIDF(test_data,tfidfModel,dictionary,indexTfidf):\n",
    "  \n",
    "    # 处理测试数据\n",
    "    query_bow = dictionary.doc2bow(test_data)\n",
    "\n",
    "    #使用TFIDF模型向量化\n",
    "    tfidfvect = tfidfModel[query_bow]\n",
    "    \n",
    "    print tfidfvect\n",
    "    \n",
    "    #TFIDF相似性\n",
    "    simstfidf = indexTfidf[tfidfvect]\n",
    "    \n",
    "    return simstfidf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# load LDA模型\n",
    "def loadModelLDA(ldaMDL,dicFile,idxFile):\n",
    "    #载入字典\n",
    "    dictionary = corpora.Dictionary.load(dicFile)\n",
    "    #载入TFIDF模型\n",
    "#     tfidfModel = models.TfidfModel.load(tfidfMDL)\n",
    "    #载入LDA模型和索引\n",
    "    ldaModel = models.LdaModel.load(ldaMDL)\n",
    "    indexLDA = similarities.MatrixSimilarity.load(idxFile)\n",
    "    \n",
    "    return (ldaModel,dictionary,indexLDA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 利用LDA模型计算相似度\n",
    "def simLDA(test_data,tfidfModel,dictionary,ldaModel,indexLDA):\n",
    "    \n",
    "    # 处理测试数据\n",
    "    query_bow = dictionary.doc2bow(test_data)\n",
    "    #使用TFIDF模型向量化\n",
    "    tfidfvect = tfidfModel[query_bow]\n",
    "    #然后LDA向量化,因为我们训练时的LDA是在TFIDF基础上做的,所以用itidfvect再向量化一次\n",
    "    ldavec = ldaModel[tfidfvect]\n",
    "\n",
    "    #LDA相似性\n",
    "    simlda = indexLDA[ldavec]\n",
    "\n",
    "    return simlda"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 获取相似文章的题目\n",
    "def getTitleIdx(simValue,TitleIdx):\n",
    "    \n",
    "    # 标题索引\n",
    "    title_set = []\n",
    "    result = []\n",
    "    # 读入训练数据\n",
    "    f=open(TitleIdx)\n",
    "    lines=f.readlines()\n",
    "    for line in lines:\n",
    "        title_set.append(line.replace('\\n',''))\n",
    "    f.close()\n",
    "    \n",
    "    for item in simValue:\n",
    "        idx,value = item\n",
    "        print \"========================\"\n",
    "        print \"similarity article :%s,simvalue= %s\" % (title_set[idx].split(\"\\t\")[0],value)\n",
    "        print \"article content : \",title_set[idx].split(\"\\t\")[1]\n",
    "        print \"========================\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 计算相似度\n",
    "def getSimTFIDF(model_info,inFile,TitleIdx):\n",
    "    \n",
    "    title_set,test_set = getTrainSet(inFile)\n",
    "\n",
    "    mdl,dic,idx = model_info\n",
    "    \n",
    "    # load module\n",
    "    tfidfModel,dict_tfidf,indexTfidf = loadModelTFIDF(mdl,dic,idx)\n",
    "    \n",
    "    for i,text in enumerate(test_set):\n",
    "        # 返回相似度最高的前5篇文章\n",
    "        # TFIDF 模型\n",
    "        simValue = sorted(enumerate(simTFIDF(text,tfidfModel,dict_tfidf,indexTfidf)),key=lambda item: -item[1])[:5]\n",
    "        print \"@@@@@@@@@@@@@@@@@@@@@@@@@@\"\n",
    "        print \"Test Article = \",title_set[i]\n",
    "        print \"TFIDF Similarity =%s\" % (simValue)\n",
    "        getTitleIdx(simValue,TitleIdx)\n",
    "        print \"@@@@@@@@@@@@@@@@@@@@@@@@@@\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 计算相似度\n",
    "def getSimLDA(model_info,inFile,TitleIdx):\n",
    "    \n",
    "    title_set,test_set = getTrainSet(inFile)\n",
    "\n",
    "    tfidf_info,lda_info = model_info\n",
    "    \n",
    "    tfidf_mdl,tfidf_dic,tfidf_idx = tfidf_info\n",
    "    LDA_mdl,LDA_dic,LDA_idx = lda_info\n",
    "    \n",
    "    # load module\n",
    "    tfidfModel,dict_tfidf,indexTfidf = loadModelTFIDF(tfidf_mdl,tfidf_dic,tfidf_idx)\n",
    "    # load module\n",
    "    ldaModel,dict_lda,indexLDA = loadModelLDA(LDA_mdl,LDA_dic,LDA_idx)\n",
    "    \n",
    "    for i,text in enumerate(test_set):\n",
    "        # 返回相似度最高的前5篇文章\n",
    "        # LDA 模型\n",
    "        simValue = sorted(enumerate(simLDA(text,tfidfModel,dict_lda,ldaModel,indexLDA)),key=lambda item: -item[1])[:5]\n",
    "        print \"@@@@@@@@@@@@@@@@@@@@@@@@@@\"\n",
    "        print \"Test Article = \",title_set[i]\n",
    "        print \"LDA Similarity =%s\" % (simValue)\n",
    "        getTitleIdx(simValue,TitleIdx)\n",
    "        print \"@@@@@@@@@@@@@@@@@@@@@@@@@@\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def main():\n",
    "    \n",
    "    tfidf_info = (\"./model/all_test_TFIDF.mdl\",\n",
    "                 \"./model/all_test_TFIDF.dic\",\n",
    "                 \"./model/all_test_TFIDF.idx\")\n",
    "    \n",
    "    lda_info = (\"./model/all_test_LDA50TOPIC.mdl\",\n",
    "               \"./model/all_test_LDA50TOPIC.dic\",\n",
    "               \"./model/all_test_LDA50TOPIC.idx\")\n",
    "    \n",
    "    inFile = \"./data/test.txt\"\n",
    "    \n",
    "    TitleIdx = \"./data/processed/all_merge.txt\"\n",
    "    \n",
    "    # 利用TFIDF 计算相似度\n",
    "    getSimTFIDF(tfidf_info,inFile,TitleIdx)\n",
    "    \n",
    "    # 利用LDA 计算相似度\n",
    "    getSimLDA((tfidf_info,lda_info),inFile,TitleIdx)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
